In [ ]:
# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627
# SUPPORT VECTOR MACHINES
# Import necessary libraries
!pip install pandas numpy scikit-learn matplotlib seaborn ISLP
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from ISLP import load_data
# Load the Auto dataset from package ISLP
Auto = load_data('Auto')
In [17]:
# Create Economy (ECO) labels based on mpg (22.75 is the median mpg)
Auto['ECO'] = np.where(Auto['mpg'] > 22.75, 'Economy', 'Consuming')
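A quick sanity check of the resulting class balance (an illustrative addition; a median split should give roughly equal groups):
In [ ]:
# How the median split divides the cars
print(Auto['ECO'].value_counts())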
In [19]:
# Visualize weight vs horsepower
sns.scatterplot(data=Auto, x='weight', y='horsepower', hue='ECO')
plt.xlabel('Weight')
plt.ylabel('Horsepower')
plt.title('Car Classification by Economy')
plt.show()
In [23]:
# Prepare dataset with necessary variables only
d = Auto[['ECO', 'weight', 'horsepower']]
# Perform SVM with linear kernel
X = d[['weight', 'horsepower']]
y = d['ECO']
svm_linear = SVC(kernel='linear', C=1)
svm_linear.fit(X, y)
print(f"Support Vectors:\n{svm_linear.support_vectors_}")
Support Vectors:
[[2833.   95.]
 [2774.   97.]
 [2587.   85.]
 ...
 [2950.   90.]
 [2790.   86.]
 [2720.   82.]]
(115 support vectors; output truncated)
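Note that weight (roughly 1600-5100) and horsepower (roughly 46-230) sit on very different scales, so the margin is dominated by weight. A hedged sketch of one common remedy, standardizing the features in a pipeline (not part of the original analysis):
In [ ]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize both features before fitting the linear SVM
svm_scaled = make_pipeline(StandardScaler(), SVC(kernel='linear', C=1))
svm_scaled.fit(X, y)
print(f"Support vectors after scaling: {len(svm_scaled.named_steps['svc'].support_vectors_)}")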
In [25]:
# Plot the SVM with linear kernel
plt.scatter(X['weight'], X['horsepower'], c=y.apply(lambda x: 0 if x == "Consuming" else 1), cmap='coolwarm')
plt.scatter(svm_linear.support_vectors_[:, 0], svm_linear.support_vectors_[:, 1], s=100, facecolors='none', edgecolors='k')
plt.xlabel('Weight')
plt.ylabel('Horsepower')
plt.title('SVM with Linear Kernel')
plt.show()
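The plot above marks the support vectors but not the boundary itself. One way to draw it is to evaluate decision_function on a grid and contour its zero level (a sketch, assuming the svm_linear model fitted above):
In [ ]:
# Draw the separating line and margins of the linear SVM
xx, yy = np.meshgrid(np.linspace(X['weight'].min(), X['weight'].max(), 200),
                     np.linspace(X['horsepower'].min(), X['horsepower'].max(), 200))
Z = svm_linear.decision_function(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.scatter(X['weight'], X['horsepower'], c=(y == 'Economy').astype(int), cmap='coolwarm')
plt.contour(xx, yy, Z, levels=[-1, 0, 1], linestyles=['--', '-', '--'], colors='k')
plt.xlabel('Weight')
plt.ylabel('Horsepower')
plt.title('Linear SVM Decision Boundary')
plt.show()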
In [27]:
# Try polynomial, radial, and sigmoid kernels
kernels = ['poly', 'rbf', 'sigmoid']
for kernel in kernels:
    svm = SVC(kernel=kernel, C=1)
    svm.fit(X, y)
    print(f"Kernel: {kernel}")
    print(f"Number of Support Vectors: {len(svm.support_vectors_)}")
Kernel: poly
Number of Support Vectors: 120
Kernel: rbf
Number of Support Vectors: 134
Kernel: sigmoid
Number of Support Vectors: 354
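The support-vector counts alone do not say which kernel classifies better. A short sketch comparing kernels by cross-validated accuracy instead, using the cross_val_score already imported:
In [ ]:
# Compare kernels by 10-fold cross-validated accuracy
for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
    scores = cross_val_score(SVC(kernel=kernel, C=1), X, y, cv=10)
    print(f"Kernel: {kernel}, mean CV accuracy: {scores.mean():.3f}")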
In [29]:
# Hyperparameter tuning with cross-validation
param_grid = {'C': np.logspace(-3, 3, 7)}
grid = GridSearchCV(SVC(kernel='linear'), param_grid, cv=10)
grid.fit(X, y)
print(f"Best Parameters: {grid.best_params_}")
print(f"Best Score: {grid.best_score_}")
Best Parameters: {'C': 100.0}
Best Score: 0.8953205128205128
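GridSearchCV keeps the score of every candidate in cv_results_; a brief sketch for inspecting the full accuracy-versus-C profile rather than only the winner:
In [ ]:
# Mean CV accuracy for each candidate value of C
cv_table = pd.DataFrame(grid.cv_results_)[['param_C', 'mean_test_score']]
print(cv_table)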
In [31]:
# Tuning with different kernels
param_grid_kernels = {
    'C': np.logspace(-3, 3, 7),
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}
grid_kernels = GridSearchCV(SVC(), param_grid_kernels, cv=10)
grid_kernels.fit(X, y)
print(f"Best Parameters with Kernel Tuning: {grid_kernels.best_params_}")
print(f"Best Score with Kernel Tuning: {grid_kernels.best_score_}")
Best Parameters with Kernel Tuning: {'C': 100.0, 'kernel': 'linear'}
Best Score with Kernel Tuning: 0.8953205128205128
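For the rbf and sigmoid kernels, gamma typically matters as much as C. A hedged sketch extending the search to tune both jointly for the rbf kernel (a larger grid, so slower to run):
In [ ]:
# Jointly tune C and gamma for the rbf kernel (illustrative grid)
param_grid_rbf = {'C': np.logspace(-3, 3, 7), 'gamma': np.logspace(-4, 1, 6)}
grid_rbf = GridSearchCV(SVC(kernel='rbf'), param_grid_rbf, cv=10)
grid_rbf.fit(X, y)
print(f"Best rbf parameters: {grid_rbf.best_params_}, score: {grid_rbf.best_score_:.3f}")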
In [32]:
# Train a final model. Note: these are NOT the tuned optimal parameters
# (the grid search above chose a linear kernel with C=100); the sigmoid
# model below illustrates how badly a mis-specified kernel can perform.
best_model = SVC(C=0.1, kernel='sigmoid')
best_model.fit(X, y)
Out[32]:
SVC(C=0.1, kernel='sigmoid')
In [33]:
# Evaluate on a validation set
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.5, random_state=1)
model_val = SVC(C=0.1, kernel='sigmoid')
model_val.fit(X_train, y_train)
y_pred = model_val.predict(X_val)
print(confusion_matrix(y_val, y_pred))
print(f"Accuracy: {np.mean(y_pred == y_val)}")
[[  0 105]
 [  0  91]]
Accuracy: 0.4642857142857143
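The sigmoid model predicts every validation car as 'Economy', doing worse than chance. For contrast, a sketch refitting the grid-search winner (linear kernel, C=100) on the same split; its exact accuracy will depend on the split:
In [ ]:
# Refit the tuned linear model on the same train/validation split
model_best = SVC(C=100, kernel='linear')
model_best.fit(X_train, y_train)
y_pred_best = model_best.predict(X_val)
print(confusion_matrix(y_val, y_pred_best))
print(f"Accuracy: {np.mean(y_pred_best == y_val):.3f}")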
In [34]:
# More than two classes
# Create ECO4 categories based on mpg values
Auto['ECO4'] = pd.cut(Auto['mpg'], bins=[0, 17, 22.75, 29, np.inf], labels=['Consuming', 'OK', 'Good', 'Economy'])
d4 = Auto[['ECO4', 'weight', 'horsepower']]
# Train SVM with ECO4 categories
X4 = d4[['weight', 'horsepower']]
y4 = d4['ECO4']
svm_multi = SVC(C=0.1, kernel='sigmoid')  # SVC handles >2 classes via one-vs-one
svm_multi.fit(X4, y4)
# Evaluate classification with more than two classes (in-sample)
y_pred_multi = svm_multi.predict(X4)
print(confusion_matrix(y4, y_pred_multi))
print(f"Accuracy with more classes: {np.mean(y_pred_multi == y4)}")
[[ 0  0 99  0]
 [62  0 33  0]
 [22  0 79  0]
 [ 1  0 96  0]]
Accuracy with more classes: 0.20153061224489796
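Again the sigmoid model uses only two of the four classes. classification_report (imported above but never used) gives per-class precision and recall; a sketch applying it with a linear kernel for comparison:
In [ ]:
# Per-class metrics for a linear-kernel multiclass SVM (illustrative; in-sample)
svm_multi_lin = SVC(C=100, kernel='linear')
svm_multi_lin.fit(X4, y4)
print(classification_report(y4, svm_multi_lin.predict(X4)))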